This script measure teh detachment score for genes and it has been used to emasure NL association score for ABCB1 in many differnt conditions performe by Anna G. Manjon at B5 nzlae
suppressMessages(suppressWarnings(library(GenomicRanges)))
suppressMessages(suppressWarnings(library(RColorBrewer)))
suppressMessages(suppressWarnings(library(reshape2)))
suppressMessages(suppressWarnings(library(ggbeeswarm)))
suppressMessages(suppressWarnings(library(limma)))
suppressMessages(suppressWarnings(library(edgeR)))
suppressMessages(suppressWarnings(library(grid)))
library(VennDiagram)
## Loading required package: futile.logger
library(stringr)
library(khroma)
library(dtplyr)
library("dplyr")
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:GenomicRanges':
##
## intersect, setdiff, union
## The following object is masked from 'package:GenomeInfoDb':
##
## intersect
## The following objects are masked from 'package:IRanges':
##
## collapse, desc, intersect, setdiff, slice, union
## The following objects are masked from 'package:S4Vectors':
##
## first, intersect, rename, setdiff, setequal, union
## The following objects are masked from 'package:BiocGenerics':
##
## combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library("ggplot2")
library("rtracklayer")
library("ggpubr")
##
## Attaching package: 'ggpubr'
## The following object is masked from 'package:VennDiagram':
##
## rotate
library("ggrastr")
#prepare output
output_dir<- "/DATA/usr/s.manzo/Projects/SGM20220704_pA_DamID_seq_AnnaB5_NewABCB1_clones/SGM20220725_Anna_New_Taxol_Clones"
dir.create(output_dir, showWarnings = FALSE)
# Parameter to extend bins to a certain distance (bases) for a more accurate
# lamina interaction score
ext<- 1e4
#Get the sample data frame prepared
input_dir<- "/DATA/usr/s.manzo/Projects/SGM20220704_pA_DamID_seq_AnnaB5_NewABCB1_clones/results/counts/bin-gatc"
#Use samples from a previous document
samples <-list.files(path = "/DATA/usr/s.manzo/Projects/SGM20220704_pA_DamID_seq_AnnaB5_NewABCB1_clones/results/counts/bin-gatc", pattern = "Lmnb2")
#samples_Tom <-list.files(path = "/DATA/scratch/usr/t.v.schaik/proj/sManzo_pADamID/ts210903_samples_Anna_pADamID_RPE/results/tracks/normalized/bin-20kb", pattern="LaminB")
samples <-as.data.frame(samples)
#samples$sample <- gsub("-10kb", "-gatc", samples$sample)
samples.lmnb2<- samples$sample
n<- nrow(samples)
# Read Chromosome size
chrom_sizes<-read.table("/DATA/scratch/usr/t.v.schaik/data/genomes/GRCh38/hg38.chrom.sizes")
names(chrom_sizes)<- c("seqnames", "length")
row.names(chrom_sizes)<-chrom_sizes$seqnames
#Prepare output files
df.count.file<- file.path(output_dir, "df_count.rds")
genes.file<- file.path(output_dir, "genes.rds")
gene.counts.file<- file.path(output_dir, "genes_counts.rds")
### 1) Count reads in genes
# ts220203 - improve Stefano's script
# New code. Changes:
# - only read data if output .rds file does not exist
# - change to read.table to read_tsv, which should be faster
# 1) Read count data frame
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ tibble 3.1.8 ✔ purrr 0.3.5
## ✔ tidyr 1.2.1 ✔ forcats 0.5.2
## ✔ readr 2.1.3
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::collapse() masks IRanges::collapse()
## ✖ dplyr::combine() masks BiocGenerics::combine()
## ✖ dplyr::desc() masks IRanges::desc()
## ✖ tidyr::expand() masks S4Vectors::expand()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::first() masks S4Vectors::first()
## ✖ dplyr::lag() masks stats::lag()
## ✖ ggplot2::Position() masks BiocGenerics::Position(), base::Position()
## ✖ purrr::reduce() masks GenomicRanges::reduce(), IRanges::reduce()
## ✖ dplyr::rename() masks S4Vectors::rename()
## ✖ dplyr::slice() masks IRanges::slice()
read_sample_and_dam <- function(x,
input_dir = "/DATA/usr/s.manzo/Projects/SGM20220704_pA_DamID_seq_AnnaB5_NewABCB1_clones/results/counts/bin-gatc") {
# read sample and dam separately
sample <- read_tsv(file.path(input_dir,
x),
show_col_types = F,
col_names = c("seqnames",
"start",
"end",
x),
col_types = c(seqnames = "-",
start = "-",
end = "-"))
dam <- read_tsv(file.path(input_dir,
gsub("Lmnb2", "Dam", x)),
show_col_types = F,
col_names = c("seqnames",
"start",
"end",
paste0(x, "_Dam")),
col_types = c(seqnames = "-",
start = "-",
end = "-"))
# combine
sample <- bind_cols(sample, dam)
# and return
sample
}
if (! file.exists(df.count.file)) {
# first, load bins
# add +1 to the start, as this was bed-based
bins <- read_tsv(file.path(input_dir,
samples.lmnb2[1]),
col_names = c("seqnames",
"start",
"end")) %>%
dplyr::select(1:3) %>%
mutate(start = start + 1)
# load counts
tib_count <- purrr::map(samples.lmnb2,
read_sample_and_dam) %>%
purrr::reduce(bind_cols)
# combine the two
tib_count <- bind_cols(bins, tib_count)
# convert to GRanges and set seqlevels
df.count <- as(tib_count, "GRanges")
# Set seqlevels
seqlevels(df.count, pruning = "coarse") <- c(paste0("chr", 1:22), "chrX")
seqlengths(df.count) <- chrom_sizes[seqlevels(df.count), "length"]
saveRDS(df.count, df.count.file)
} else {
df.count <- readRDS(df.count.file)
}
#I need the gene locations to count the data
genes<- import("/DATA/scratch/usr/t.v.schaik/data/gene_builds/GRCh38/gencode.v24.primary_assembly.annotation.gtf")
genes <- genes[genes$type=="gene"]
#Filter for protein coding genes
genes<-genes[genes$gene_type == "protein_coding"]
#filter for chromosomes
genes <- genes[seqnames(genes) %in% c(paste0("chr", 1:22), "chrX")]
#Set seqlevels
seqlevels(genes, pruning = "coarse") <- c(paste0("chr", 1:22),"chrX")
seqlengths(genes)<- chrom_sizes[seqlevels(genes), "length"]
#First extend the genes
genes.query <- genes
mcols(genes.query)<- NULL
start(genes.query)<- start(genes.query) - ext
end(genes.query)<- end(genes.query) + ext
#Overlap genes with the counts
ovl<-findOverlaps(genes.query, df.count)
#Get the number of GATC fragment for every gene query
mcols(genes) [,"GATC_fragments"]<- 0
t <- table(queryHits(ovl))
mcols(genes)[as.integer(names(t)), "GATC_fragments"] <- as.numeric(t)
#Get the summed signal for every gene
gene.counts<- genes
mcols(gene.counts) <-NULL
#mcols(gene.counts)[, names(mcols(df.count))]<-NA
# Faster tibble/dtplyr implementation
tib <- lazy_dt(as.tibble(ovl)) %>% #as_tibble()
rename_all(~ c("idx_genes", "idx_gatc")) %>%
as_tibble()
## Warning: `as.tibble()` was deprecated in tibble 2.0.0.
## ℹ Please use `as_tibble()` instead.
## ℹ The signature and semantics have changed, see `?as_tibble`.
tib <- bind_cols(tib,
as.tibble(mcols(df.count[tib$idx_gatc]))) #as_tibble()
tib_summarise <- tib %>%
gather(key, value, -contains("idx"))
tib_summarise <- lazy_dt(tib_summarise) %>%
group_by(idx_genes, key) %>%
summarise(count = sum(value)) %>%
ungroup() %>%
mutate(key = factor(key, levels = str_replace_all(names(mcols(df.count)), "-", "."))) %>%
as.tibble() #as_tibble()
## `summarise()` has grouped output by 'idx_genes'. You can override using the
## `.groups` argument.
tib_genes <- tib_summarise %>%
spread(key, count)
mcols(gene.counts) <- tib_genes %>%
dplyr::select(-idx_genes)
names(mcols(gene.counts)) <- names(mcols(df.count))
saveRDS(genes, genes.file)
saveRDS(gene.counts, gene.counts.file)
df.counts<-readRDS(df.count.file)
genes<- readRDS(genes.file)
gene.counts<- readRDS(gene.counts.file)
Next, we would like to normalize the gene counts for various parameters:
Absolute read filtering
# To-do: filter genes for having too few reads?
# alternatively, filter for number of GATC fragments overlapping a gene?
idx<- rowSums(as(mcols(gene.counts), "data.frame") >=10) >=2
gene.counts <- gene.counts[idx]
genes <- genes[idx]
The library size
library_size <- colSums(as(mcols(df.count), "data.frame"))
library_size
## pADamID-DPURO3_r1_Lmnb2-gatc.counts.txt.gz
## 9146819
## pADamID-DPURO3_r1_Lmnb2-gatc.counts.txt.gz_Dam
## 11628140
## pADamID-DPURO3_r2_Lmnb2-gatc.counts.txt.gz
## 16015780
## pADamID-DPURO3_r2_Lmnb2-gatc.counts.txt.gz_Dam
## 15530112
## pADamID-iCA3_c7_r1_Lmnb2-gatc.counts.txt.gz
## 23983578
## pADamID-iCA3_c7_r1_Lmnb2-gatc.counts.txt.gz_Dam
## 20015832
## pADamID-iCA3_c7_r2_Lmnb2-gatc.counts.txt.gz
## 23754126
## pADamID-iCA3_c7_r2_Lmnb2-gatc.counts.txt.gz_Dam
## 12392542
## pADamID-iCA3_c7_TXR_r1_Lmnb2-gatc.counts.txt.gz
## 16280849
## pADamID-iCA3_c7_TXR_r1_Lmnb2-gatc.counts.txt.gz_Dam
## 19601103
## pADamID-iCA3_c7_TXR_r2_Lmnb2-gatc.counts.txt.gz
## 26154121
## pADamID-iCA3_c7_TXR_r2_Lmnb2-gatc.counts.txt.gz_Dam
## 26154121
## pADamID-iCA3P_r1_Lmnb2-gatc.counts.txt.gz
## 16154243
## pADamID-iCA3P_r1_Lmnb2-gatc.counts.txt.gz_Dam
## 20956293
## pADamID-iCA3P_r2_Lmnb2-gatc.counts.txt.gz
## 17242299
## pADamID-iCA3P_r2_Lmnb2-gatc.counts.txt.gz_Dam
## 16661470
## pADamID-iCA3P_TXR_r1_Lmnb2-gatc.counts.txt.gz
## 23549410
## pADamID-iCA3P_TXR_r1_Lmnb2-gatc.counts.txt.gz_Dam
## 19472337
## pADamID-iCA3P_TXR_r2_Lmnb2-gatc.counts.txt.gz
## 33529924
## pADamID-iCA3P_TXR_r2_Lmnb2-gatc.counts.txt.gz_Dam
## 18239916
## pADamID-RPE0_r1_Lmnb2-gatc.counts.txt.gz
## 13791640
## pADamID-RPE0_r1_Lmnb2-gatc.counts.txt.gz_Dam
## 14815531
## pADamID-RPE0_r2_Lmnb2-gatc.counts.txt.gz
## 18837611
## pADamID-RPE0_r2_Lmnb2-gatc.counts.txt.gz_Dam
## 18477002
## pADamID-TXR3_r1_Lmnb2-gatc.counts.txt.gz
## 13539117
## pADamID-TXR3_r1_Lmnb2-gatc.counts.txt.gz_Dam
## 11677399
## pADamID-TXR3_r2_Lmnb2-gatc.counts.txt.gz
## 15468417
## pADamID-TXR3_r2_Lmnb2-gatc.counts.txt.gz_Dam
## 16265507
## pADamID-TXR4_r1_Lmnb2-gatc.counts.txt.gz
## 14819533
## pADamID-TXR4_r1_Lmnb2-gatc.counts.txt.gz_Dam
## 13951858
## pADamID-TXR4_r2_Lmnb2-gatc.counts.txt.gz
## 15712352
## pADamID-TXR4_r2_Lmnb2-gatc.counts.txt.gz_Dam
## 14335555
## pADamID-TXR5_r1_Lmnb2-gatc.counts.txt.gz
## 14514591
## pADamID-TXR5_r1_Lmnb2-gatc.counts.txt.gz_Dam
## 13510895
## pADamID-TXR5_r2_Lmnb2-gatc.counts.txt.gz
## 16824129
## pADamID-TXR5_r2_Lmnb2-gatc.counts.txt.gz_Dam
## 13196807
## pADamID-TXR6_r1_Lmnb2-gatc.counts.txt.gz
## 19363471
## pADamID-TXR6_r1_Lmnb2-gatc.counts.txt.gz_Dam
## 16382760
## pADamID-TXR6_r2_Lmnb2-gatc.counts.txt.gz
## 17710880
## pADamID-TXR6_r2_Lmnb2-gatc.counts.txt.gz_Dam
## 14072587
# Also combined the counts (Dam + lamina) and have them separate
gene.counts.combined<- gene.counts.lamina <- gene.counts.dam<- gene.counts
mcols(gene.counts.combined) <- (as(mcols(gene.counts)[, seq(1, 2*n, 2)], "data.frame")+
as(mcols(gene.counts)[, seq(2, 2*n, 2)], "data.frame"))
mcols(gene.counts.lamina) <-mcols(gene.counts)[,seq(1, 2*n,2)]
mcols(gene.counts.dam) <-mcols(gene.counts)[,seq(2, 2*n,2)]
#Normalize to 1 M reads
norm_factor <- library_size/ 1e6
gene.counts.norm<- gene.counts
mcols(gene.counts.norm)<- t(t(as(mcols(gene.counts), "data.frame"))/ norm_factor)
#Also create a 1M mean counts of Dam and Lamina per experiment
gene.counts.norm.combined<- gene.counts.norm
mcols(gene.counts.norm.combined) <- (as(mcols(gene.counts.norm) [,seq(1, 2*n, 2)], "data.frame")+
as(mcols(gene.counts.norm) [,seq(2, 2*n, 2)], "data.frame")) /2
# Normalize Lamina over Dam function (ratio2)
NormalizeDamID <- function(df, pseudo = 1) {
# This function expects a data frame that is composed of lamina and Dam-only
# columns, in that order. It simply determines the log2 ratio of the two for
# every pair of columns. A pseudocount is added to prevent problems.
n <- ncol(df)
df.norm <- log2((df[, seq(1, n-1, 2)] + pseudo) / (df[, seq(2, n, 2)] + pseudo))
df.norm
}
#Normalize LB over Dam (ratio2)
genes.norm <- gene.counts
mcols(genes.norm) <- NormalizeDamID(as(mcols(gene.counts.norm), "data.frame"))
mcols(genes.norm) <- mcols(genes.norm)[, samples$sample]
# Plot the distributions for the various samples This does not work
#plot(1, 1, type = "n", xlim = c(-6, 4), ylim = c(0, 0.6),
# xlab = "Dam-ratio (log2)", ylab = "density", main = "Density Dam ratio for genes")
#for (i in 1:n) {
# lines(density(mcols(genes.norm)[, i],na.rm = TRUE,), col = samples, lwd = 2,
# lty = as.numeric(samples.df$integration[i]),)
#}
#legend("topright", legend = levels(samples),
# pch = 19, col = 1:length(levels(samples)))
#Scale the data to the same mean and stdev
genes.norm.scaled<-genes.norm
mcols(genes.norm.scaled)<- scale(as(mcols(genes.norm.scaled), "data.frame"))
# Save files as RDS
saveRDS(samples, file.path(output_dir, "samples.rds"))
saveRDS(genes.norm.scaled, file.path(output_dir, "genes_norm_filtered.rds"))
# First, change the mcols of the genes
mcols(genes) <- mcols(genes)[, c("gene_id", "gene_name", "GATC_fragments")]
saveRDS(genes, file.path(output_dir, "genes_filtered.rds"))
# Samples data frame - basically metadata
padamid.samples<- readRDS(file.path(output_dir,
"samples.rds"))
# Filtered genes used in the pA-DamID analysis
padamid.genes <- readRDS(file.path(output_dir,
"genes_filtered.rds"))
# Data frame with per-gene information on differential analysis
#padamid.results <- readRDS(file.path(output_dir,
# "Differential_score.rds"))
padamid.norm <- readRDS(file.path(output_dir,
"genes_norm_filtered.rds"))
names(mcols(padamid.norm))<-padamid.samples$samples
class(padamid.norm)
## [1] "GRanges"
## attr(,"package")
## [1] "GenomicRanges"
#class(padamid.results)
class(padamid.genes)
## [1] "GRanges"
## attr(,"package")
## [1] "GenomicRanges"
class(padamid.samples)
## [1] "data.frame"
padamid_norm_df<- as(mcols(padamid.norm), "data.frame")
padamid_genes_df<- as(mcols(padamid.genes), "data.frame")
#binding gene name and danID score
damid_score_gene<-cbind(padamid_genes_df, padamid_norm_df)
Taxol_damid_score_gene<-damid_score_gene %>%mutate(DPURO_3=(`pADamID-DPURO3_r1_Lmnb2-gatc.counts.txt.gz`+ `pADamID-DPURO3_r2_Lmnb2-gatc.counts.txt.gz`)/2) %>%
mutate(TXR3=(`pADamID-TXR3_r1_Lmnb2-gatc.counts.txt.gz`+ `pADamID-TXR3_r2_Lmnb2-gatc.counts.txt.gz`)/2) %>%
mutate(TXR4=(`pADamID-TXR4_r1_Lmnb2-gatc.counts.txt.gz`+ `pADamID-TXR4_r2_Lmnb2-gatc.counts.txt.gz`)/2) %>%
mutate(RPE0=(`pADamID-RPE0_r1_Lmnb2-gatc.counts.txt.gz`+ `pADamID-RPE0_r2_Lmnb2-gatc.counts.txt.gz`)/2) %>%
mutate(TXR5=(`pADamID-TXR5_r1_Lmnb2-gatc.counts.txt.gz`+ `pADamID-TXR5_r2_Lmnb2-gatc.counts.txt.gz`)/2) %>%
mutate(TXR6=(`pADamID-TXR6_r1_Lmnb2-gatc.counts.txt.gz`+ `pADamID-TXR6_r2_Lmnb2-gatc.counts.txt.gz`)/2) %>%
mutate(ANCHOR_P=(`pADamID-iCA3P_r1_Lmnb2-gatc.counts.txt.gz`+ `pADamID-iCA3P_r2_Lmnb2-gatc.counts.txt.gz`)/2) %>%
mutate(ANCHOR_P_TXR=(`pADamID-iCA3P_TXR_r1_Lmnb2-gatc.counts.txt.gz`+ `pADamID-iCA3P_TXR_r2_Lmnb2-gatc.counts.txt.gz`)/2) %>%
dplyr::rename(ANCHOR_c7_r1=`pADamID-iCA3_c7_r1_Lmnb2-gatc.counts.txt.gz`)%>%
dplyr::rename(ANCHOR_c7_TXR_r1=`pADamID-iCA3_c7_TXR_r1_Lmnb2-gatc.counts.txt.gz`)
ABCB1<-Taxol_damid_score_gene%>%filter(gene_name=="ABCB1")
#pdf("/DATA/usr/s.manzo/Projects/SGM20220704_pA_DamID_seq_AnnaB5_NewABCB1_clones/pdf/Scatter_Plot_TXR3.pdf")
Taxol_damid_score_gene%>% ggplot(., aes(x= DPURO_3, y= TXR3))+ rasterize(geom_point(alpha=0.1, size=0.3), dpi=300) + stat_cor(method = "pearson", label.x = -2, label.y= 2.2) +ggtitle("TXR_Clone3,ABCB1, 20 kb_bins")+ theme_bw()+ geom_abline()+ geom_point(data=ABCB1, aes(x= DPURO_3, y= TXR3), color="red", size=2)+geom_text(data=ABCB1,aes(label=gene_name ),color="black", hjust=0, vjust=1.5)
#pdf("/DATA/usr/s.manzo/Projects/SGM20220704_pA_DamID_seq_AnnaB5_NewABCB1_clones/pdf/Scatter_Plot_TXR4.pdf")
Taxol_damid_score_gene%>% ggplot(., aes(x= DPURO_3, y= TXR4))+ rasterize(geom_point(alpha=0.1, size=0.3), dpi=300) + stat_cor(method = "pearson", label.x = -2, label.y= 2.2) +ggtitle("TXR_Clone4,ABCB1, 20 kb_bins")+ theme_bw()+ geom_abline()+ geom_point(data=ABCB1, aes(x= DPURO_3, y= TXR4), color="red", size=2)+geom_text(data=ABCB1,aes(label=gene_name ),color="black", hjust=0, vjust=1.5)
#pdf("/DATA/usr/s.manzo/Projects/SGM20220704_pA_DamID_seq_AnnaB5_NewABCB1_clones/pdf/Scatter_Plot_TXR5.pdf")
Taxol_damid_score_gene%>% ggplot(., aes(x= RPE0, y= TXR5))+ rasterize(geom_point(alpha=0.1, size=0.3), dpi=300) + stat_cor(method = "pearson", label.x = -2, label.y= 2.2) +ggtitle("TXR_Clone5,ABCB1, 20 kb_bins")+ theme_bw()+ geom_abline()+ geom_point(data=ABCB1, aes(x= RPE0, y= TXR5), color="red", size=2)+geom_text(data=ABCB1,aes(label=gene_name ),color="black", hjust=0, vjust=1.5)
#pdf("/DATA/usr/s.manzo/Projects/SGM20220704_pA_DamID_seq_AnnaB5_NewABCB1_clones/pdf/Scatter_Plot_TXR6.pdf")
Taxol_damid_score_gene%>% ggplot(., aes(x= RPE0, y= TXR6))+ rasterize(geom_point(alpha=0.1, size=0.3), dpi=300) + stat_cor(method = "pearson", label.x = -2, label.y= 2.2) +ggtitle("TXR_Clone6,ABCB1, 20 kb_bins")+ theme_bw()+ geom_abline()+ geom_point(data=ABCB1, aes(x= RPE0, y= TXR6), color="red", size=2)+geom_text(data=ABCB1,aes(label=gene_name ),color="black", hjust=0, vjust=1.5)
#pdf("/DATA/usr/s.manzo/Projects/SGM20220704_pA_DamID_seq_AnnaB5_NewABCB1_clones/pdf/Scatter_Plot_Anchor3_parental.pdf")
Taxol_damid_score_gene%>% ggplot(., aes(x= ANCHOR_P, y= ANCHOR_P_TXR))+ rasterize(geom_point(alpha=0.1, size=0.3), dpi=300) + stat_cor(method = "pearson", label.x = -2, label.y= 2.2) +ggtitle("TXR_Anchor_Parental,ABCB1, 20 kb_bins")+ theme_bw()+ geom_abline()+ geom_point(data=ABCB1, aes(x= ANCHOR_P, y= ANCHOR_P_TXR), color="red", size=2)+geom_text(data=ABCB1,aes(label=gene_name ),color="black", hjust=0, vjust=1.5)
#pdf("/DATA/usr/s.manzo/Projects/SGM20220704_pA_DamID_seq_AnnaB5_NewABCB1_clones/pdf/Scatter_Plot_Anchor3_Clone7.pdf")
Taxol_damid_score_gene%>% ggplot(., aes(x= ANCHOR_c7_r1, y= ANCHOR_c7_TXR_r1))+ rasterize(geom_point(alpha=0.1, size=0.3), dpi=300) + stat_cor(method = "pearson", label.x = -2, label.y= 2.2) +ggtitle("TXR_Anchor_clone 7,ABCB1, 20 kb_bins")+ theme_bw()+ geom_abline()+ geom_point(data=ABCB1, aes(x= ANCHOR_c7_r1, y= ANCHOR_c7_TXR_r1), color="red", size=2)+geom_text(data=ABCB1,aes(label=gene_name ),color="black", hjust=0, vjust=1.5)
Now I want to use old data from Anna and Tom to have an overview of all TXR clones
#prepare output
output_dir<- "/DATA/usr/s.manzo/Projects/SGM20220704_pA_DamID_seq_AnnaB5_NewABCB1_clones/SGM20220725_Anna_New_Taxol_Clones"
dir.create(output_dir, showWarnings = FALSE)
# Parameter to extend bins to a certain distance (bases) for a more accurate
# lamina interaction score
ext<- 1e4
#Get the ample data frame prepared
input_dir<- "/DATA/scratch/usr/t.v.schaik/proj/sManzo_pADamID/ts210903_samples_Anna_pADamID_RPE/results/counts/bin-gatc"
#Use samples from a previous document
#samples <-list.files(path = "/DATA/usr/s.manzo/Projects/SGM20220704_pA_DamID_seq_AnnaB5_NewABCB1_clones/results/counts/bin-gatc", pattern = "Lmnb2")
samples_Tom <-list.files(path = "/DATA/scratch/usr/t.v.schaik/proj/sManzo_pADamID/ts210903_samples_Anna_pADamID_RPE/results/counts/bin-gatc", pattern="LaminB2")
samples_Tom <-as.data.frame(samples_Tom)
#samples_Tom$sample <- gsub("-10kb", "-gatc", samples_Tom$sample)
samples_Tom.lmnb2<- samples_Tom$sample
n<- nrow(samples_Tom)
# Read Chromosome size
chrom_sizes<-read.table("/DATA/scratch/usr/t.v.schaik/data/genomes/GRCh38/hg38.chrom.sizes")
names(chrom_sizes)<- c("seqnames", "length")
row.names(chrom_sizes)<-chrom_sizes$seqnames
#Prepare output files
df.count_Tom.file<- file.path(output_dir, "df_count_Tom.rds")
genes_Tom.file<- file.path(output_dir, "genes_Tom.rds")
gene_Tom.counts.file<- file.path(output_dir, "genes_Tom_counts.rds")
### 1) Count reads in genes_Tom
# ts220203 - improve Stefano's script
# New code. Changes:
# - only read data if output .rds file does not exist
# - change to read.table to read_tsv, which should be faster
# 1) Read count data frame
library(tidyverse)
read_sample_and_dam <- function(x,
input_dir = "/DATA/scratch/usr/t.v.schaik/proj/sManzo_pADamID/ts210903_samples_Anna_pADamID_RPE/results/counts/bin-gatc") {
# read sample and dam separately
sample <- read_tsv(file.path(input_dir,
x),
show_col_types = F,
col_names = c("seqnames",
"start",
"end",
x),
col_types = c(seqnames = "-",
start = "-",
end = "-"))
dam <- read_tsv(file.path(input_dir,
gsub("LaminB2", "Dam", x)),
show_col_types = F,
col_names = c("seqnames",
"start",
"end",
paste0(x, "_Dam")),
col_types = c(seqnames = "-",
start = "-",
end = "-"))
# combine
sample <- bind_cols(sample, dam)
# and return
sample
}
if (! file.exists(df.count_Tom.file)) {
# first, load bins
# add +1 to the start, as this was bed-based
bins <- read_tsv(file.path(input_dir,
samples_Tom.lmnb2[1]),
col_names = c("seqnames",
"start",
"end")) %>%
dplyr::select(1:3) %>%
mutate(start = start + 1)
# load counts
tib_count <- purrr::map(samples_Tom.lmnb2,
read_sample_and_dam) %>%
purrr::reduce(bind_cols)
# combine the two
tib_count <- bind_cols(bins, tib_count)
# convert to GRanges and set seqlevels
df.count_Tom <- as(tib_count, "GRanges")
# Set seqlevels
seqlevels(df.count_Tom, pruning = "coarse") <- c(paste0("chr", 1:22), "chrX")
seqlengths(df.count_Tom) <- chrom_sizes[seqlevels(df.count_Tom), "length"]
saveRDS(df.count_Tom, df.count_Tom.file)
} else {
df.count_Tom <- readRDS(df.count_Tom.file)
}
/DATA/scratch/usr/t.v.schaik/proj/sManzo_pADamID/ts210903_samples_Anna_pADamID_RPE/results/counts/bin-20kb/A549_61_LaminB2_R1-20kb.counts.txt.gz
#I need the gene locations to count the data
genes_Tom<- import("/DATA/scratch/usr/t.v.schaik/data/gene_builds/GRCh38/gencode.v24.primary_assembly.annotation.gtf")
genes_Tom <- genes_Tom[genes_Tom$type=="gene"]
#Filter for protein coding genes_Tom
genes_Tom<-genes_Tom[genes_Tom$gene_type == "protein_coding"]
#filter for chromosomes
genes_Tom <- genes_Tom[seqnames(genes_Tom) %in% c(paste0("chr", 1:22), "chrX")]
#Set seqlevels
seqlevels(genes_Tom, pruning = "coarse") <- c(paste0("chr", 1:22),"chrX")
seqlengths(genes_Tom)<- chrom_sizes[seqlevels(genes_Tom), "length"]
#First extend the genes_Tom
genes_Tom.query <- genes_Tom
mcols(genes_Tom.query)<- NULL
start(genes_Tom.query)<- start(genes_Tom.query) - ext
end(genes_Tom.query)<- end(genes_Tom.query) + ext
#Overlap genes_Tom with the counts
ovl<-findOverlaps(genes_Tom.query, df.count_Tom)
#Get the number of GATC fragment for every gene query
mcols(genes_Tom) [,"GATC_fragments"]<- 0
t <- table(queryHits(ovl))
mcols(genes_Tom)[as.integer(names(t)), "GATC_fragments"] <- as.numeric(t)
#Get the summed signal for every gene
gene_Tom.counts<- genes_Tom
mcols(gene_Tom.counts) <-NULL
#mcols(gene.counts)[, names(mcols(df.count_Tom))]<-NA
# Faster tibble/dtplyr implementation
tib <- lazy_dt(as.tibble(ovl)) %>% #as_tibble()
rename_all(~ c("idx_genes_Tom", "idx_gatc")) %>%
as_tibble()
tib <- bind_cols(tib,
as.tibble(mcols(df.count_Tom[tib$idx_gatc]))) #as_tibble()
tib_summarise <- tib %>%
gather(key, value, -contains("idx"))
tib_summarise <- lazy_dt(tib_summarise) %>%
group_by(idx_genes_Tom, key) %>%
summarise(count = sum(value)) %>%
ungroup() %>%
mutate(key = factor(key, levels = str_replace_all(names(mcols(df.count_Tom)), "-", "."))) %>%
as.tibble() #as_tibble()
## `summarise()` has grouped output by 'idx_genes_Tom'. You can override using the
## `.groups` argument.
tib_genes_Tom <- tib_summarise %>%
spread(key, count)
mcols(gene_Tom.counts) <- tib_genes_Tom %>%
dplyr::select(-idx_genes_Tom)
names(mcols(gene_Tom.counts)) <- names(mcols(df.count_Tom))
saveRDS(genes_Tom, genes_Tom.file)
saveRDS(gene_Tom.counts, gene_Tom.counts.file)
df.count_Tom.file<- file.path(output_dir, “df_count_Tom.rds”) genes_Tom.file<- file.path(output_dir, “genes_Tom.rds”) gene_Tom.counts.file<- file.path(output_dir, “genes_Tom_counts.rds”)
df.counts_Tom<-readRDS(df.count_Tom.file)
genes_Tom<- readRDS(genes_Tom.file)
genes_Tom.count<- readRDS(gene_Tom.counts.file)
Next, we would like to normalize the gene counts for various parameters:
Absolute read filtering
# To-do: filter genes_Tom for having too few reads?
# alternatively, filter for number of GATC fragments overlapping a gene?
idx<- rowSums(as(mcols(genes_Tom.count), "data.frame") >=10) >=2
genes_Tom.count <- genes_Tom.count[idx]
genes_Tom <- genes_Tom[idx]
The library size
library_size <- colSums(as(mcols(df.count_Tom), "data.frame"))
library_size
## A549_61_LaminB2_R1-gatc.counts.txt.gz
## 7128669
## A549_61_LaminB2_R1-gatc.counts.txt.gz_Dam
## 5070144
## A549_61_LaminB2_R2-gatc.counts.txt.gz
## 5287346
## A549_61_LaminB2_R2-gatc.counts.txt.gz_Dam
## 4105635
## A549_71_LaminB2_R1-gatc.counts.txt.gz
## 3413463
## A549_71_LaminB2_R1-gatc.counts.txt.gz_Dam
## 3808615
## A549_71_LaminB2_R2-gatc.counts.txt.gz
## 9883893
## A549_71_LaminB2_R2-gatc.counts.txt.gz_Dam
## 4371685
## RPE_CRISPRa_ABCB1_LaminB1_R1-gatc.counts.txt.gz
## 6648108
## RPE_CRISPRa_ABCB1_LaminB1_R1-gatc.counts.txt.gz_Dam
## 6648108
## RPE_CRISPRa_ABCB1_LaminB1_R2-gatc.counts.txt.gz
## 6901312
## RPE_CRISPRa_ABCB1_LaminB1_R2-gatc.counts.txt.gz_Dam
## 6901312
## RPE_CRISPRa_ABCB4_LaminB1_R1-gatc.counts.txt.gz
## 7213865
## RPE_CRISPRa_ABCB4_LaminB1_R1-gatc.counts.txt.gz_Dam
## 7213865
## RPE_CRISPRa_ABCB4_LaminB1_R2-gatc.counts.txt.gz
## 7230495
## RPE_CRISPRa_ABCB4_LaminB1_R2-gatc.counts.txt.gz_Dam
## 7230495
## RPE_CRISPRa_combo_LaminB1_R1-gatc.counts.txt.gz
## 7109704
## RPE_CRISPRa_combo_LaminB1_R1-gatc.counts.txt.gz_Dam
## 7109704
## RPE_CRISPRa_combo_LaminB1_R2-gatc.counts.txt.gz
## 10342949
## RPE_CRISPRa_combo_LaminB1_R2-gatc.counts.txt.gz_Dam
## 10342949
## RPE_CRISPRa_RUNCD3B_LaminB1_R1-gatc.counts.txt.gz
## 6974508
## RPE_CRISPRa_RUNCD3B_LaminB1_R1-gatc.counts.txt.gz_Dam
## 6974508
## RPE_CRISPRa_RUNCD3B_LaminB1_R2-gatc.counts.txt.gz
## 7428567
## RPE_CRISPRa_RUNCD3B_LaminB1_R2-gatc.counts.txt.gz_Dam
## 7428567
## RPE_CRISPRa_WT_LaminB1_R1-gatc.counts.txt.gz
## 5722620
## RPE_CRISPRa_WT_LaminB1_R1-gatc.counts.txt.gz_Dam
## 5722620
## RPE_CRISPRa_WT_LaminB1_R2-gatc.counts.txt.gz
## 6491458
## RPE_CRISPRa_WT_LaminB1_R2-gatc.counts.txt.gz_Dam
## 6491458
## RPE_dPC3_TXR3_LaminB2_R1-gatc.counts.txt.gz
## 4846081
## RPE_dPC3_TXR3_LaminB2_R1-gatc.counts.txt.gz_Dam
## 5020601
## RPE_dPC3_TXR3_LaminB2_R2-gatc.counts.txt.gz
## 4753605
## RPE_dPC3_TXR3_LaminB2_R2-gatc.counts.txt.gz_Dam
## 5696142
## RPE_dPC3_TXR4_LaminB2_R1-gatc.counts.txt.gz
## 5017152
## RPE_dPC3_TXR4_LaminB2_R1-gatc.counts.txt.gz_Dam
## 6023804
## RPE_dPC3_TXR4_LaminB2_R2-gatc.counts.txt.gz
## 4339456
## RPE_dPC3_TXR4_LaminB2_R2-gatc.counts.txt.gz_Dam
## 5869564
## RPE_dPC3_WT_LaminB1_R3-gatc.counts.txt.gz
## 6199078
## RPE_dPC3_WT_LaminB1_R3-gatc.counts.txt.gz_Dam
## 6199078
## RPE_dPC3_WT_LaminB1_R4-gatc.counts.txt.gz
## 6490134
## RPE_dPC3_WT_LaminB1_R4-gatc.counts.txt.gz_Dam
## 6490134
## RPE_dPC3_WT_LaminB2_R1-gatc.counts.txt.gz
## 6208211
## RPE_dPC3_WT_LaminB2_R1-gatc.counts.txt.gz_Dam
## 6837316
## RPE_dPC3_WT_LaminB2_R2-gatc.counts.txt.gz
## 4465616
## RPE_dPC3_WT_LaminB2_R2-gatc.counts.txt.gz_Dam
## 6696393
## RPE_dPC3Mi_TxR_s20_LaminB1_R1-gatc.counts.txt.gz
## 6522273
## RPE_dPC3Mi_TxR_s20_LaminB1_R1-gatc.counts.txt.gz_Dam
## 6522273
## RPE_dPC3Mi_TxR_s20_LaminB1_R2-gatc.counts.txt.gz
## 6066689
## RPE_dPC3Mi_TxR_s20_LaminB1_R2-gatc.counts.txt.gz_Dam
## 6066689
## RPE_iCut_5AZA_62_5_LaminB2_R1-gatc.counts.txt.gz
## 5371421
## RPE_iCut_5AZA_62_5_LaminB2_R1-gatc.counts.txt.gz_Dam
## 4856657
## RPE_iCut_5AZA_62_5_LaminB2_R2-gatc.counts.txt.gz
## 5869513
## RPE_iCut_5AZA_62_5_LaminB2_R2-gatc.counts.txt.gz_Dam
## 6443064
## RPE_iCut_ABCB1_cr6_10h_LaminB2_R1-gatc.counts.txt.gz
## 2664729
## RPE_iCut_ABCB1_cr6_10h_LaminB2_R1-gatc.counts.txt.gz_Dam
## 4582596
## RPE_iCut_ABCB1_cr6_10h_LaminB2_R2-gatc.counts.txt.gz
## 9355652
## RPE_iCut_ABCB1_cr6_10h_LaminB2_R2-gatc.counts.txt.gz_Dam
## 12557773
## RPE_iCut_ABCB1_cr6_10h_LaminB2_R3-gatc.counts.txt.gz
## 7421436
## RPE_iCut_ABCB1_cr6_10h_LaminB2_R3-gatc.counts.txt.gz_Dam
## 8532793
## RPE_iCut_ABCB1_cr6_10h_LaminB2_R4-gatc.counts.txt.gz
## 8817767
## RPE_iCut_ABCB1_cr6_10h_LaminB2_R4-gatc.counts.txt.gz_Dam
## 5515521
## RPE_iCut_ABCB1_cr6_24h_LaminB2_R1-gatc.counts.txt.gz
## 2158312
## RPE_iCut_ABCB1_cr6_24h_LaminB2_R1-gatc.counts.txt.gz_Dam
## 5919371
## RPE_iCut_ABCB1_cr6_24h_LaminB2_R2-gatc.counts.txt.gz
## 12254511
## RPE_iCut_ABCB1_cr6_24h_LaminB2_R2-gatc.counts.txt.gz_Dam
## 12260142
## RPE_iCut_ABCB1_cr6_ATMi_10h_LaminB2_R1-gatc.counts.txt.gz
## 4493792
## RPE_iCut_ABCB1_cr6_ATMi_10h_LaminB2_R1-gatc.counts.txt.gz_Dam
## 8370230
## RPE_iCut_ABCB1_cr6_ATMi_10h_LaminB2_R2-gatc.counts.txt.gz
## 5340283
## RPE_iCut_ABCB1_cr6_ATMi_10h_LaminB2_R2-gatc.counts.txt.gz_Dam
## 7517688
## RPE_iCut_ABCB1_cr6_ATRi_10h_LaminB2_R1-gatc.counts.txt.gz
## 4451419
## RPE_iCut_ABCB1_cr6_ATRi_10h_LaminB2_R1-gatc.counts.txt.gz_Dam
## 5817951
## RPE_iCut_ABCB1_cr6_ATRi_10h_LaminB2_R2-gatc.counts.txt.gz
## 4072549
## RPE_iCut_ABCB1_cr6_ATRi_10h_LaminB2_R2-gatc.counts.txt.gz_Dam
## 8075793
## RPE_iCut_ABCB1_cr6_CK666_10h_LaminB2_R1-gatc.counts.txt.gz
## 2309130
## RPE_iCut_ABCB1_cr6_CK666_10h_LaminB2_R1-gatc.counts.txt.gz_Dam
## 4043287
## RPE_iCut_ABCB1_cr6_CK666_10h_LaminB2_R2-gatc.counts.txt.gz
## 2605713
## RPE_iCut_ABCB1_cr6_CK666_10h_LaminB2_R2-gatc.counts.txt.gz_Dam
## 3556261
## RPE_iCut_ABCB1_cr6_DRB_10h_LaminB2_R1-gatc.counts.txt.gz
## 4572423
## RPE_iCut_ABCB1_cr6_DRB_10h_LaminB2_R1-gatc.counts.txt.gz_Dam
## 7281427
## RPE_iCut_ABCB1_cr6_DRB_10h_LaminB2_R2-gatc.counts.txt.gz
## 2893637
## RPE_iCut_ABCB1_cr6_DRB_10h_LaminB2_R2-gatc.counts.txt.gz_Dam
## 3978089
## RPE_iCut_ABCB1_cr6_M3814_10h_LaminB2_R1-gatc.counts.txt.gz
## 4430998
## RPE_iCut_ABCB1_cr6_M3814_10h_LaminB2_R1-gatc.counts.txt.gz_Dam
## 8999909
## RPE_iCut_ABCB1_cr6_M3814_10h_LaminB2_R2-gatc.counts.txt.gz
## 2806431
## RPE_iCut_ABCB1_cr6_M3814_10h_LaminB2_R2-gatc.counts.txt.gz_Dam
## 4361892
## RPE_iCut_ABCB1_cr6_Olaparib_10h_LaminB2_R1-gatc.counts.txt.gz
## 4298643
## RPE_iCut_ABCB1_cr6_Olaparib_10h_LaminB2_R1-gatc.counts.txt.gz_Dam
## 4303820
## RPE_iCut_ABCB1_cr6_Olaparib_10h_LaminB2_R2-gatc.counts.txt.gz
## 4322053
## RPE_iCut_ABCB1_cr6_Olaparib_10h_LaminB2_R2-gatc.counts.txt.gz_Dam
## 3647708
## RPE_iCut_ABCB1_cr6_short_10h_LaminB2_R1-gatc.counts.txt.gz
## 2779896
## RPE_iCut_ABCB1_cr6_short_10h_LaminB2_R1-gatc.counts.txt.gz_Dam
## 2945057
## RPE_iCut_ABCB1_cr6_short_10h_LaminB2_R2-gatc.counts.txt.gz
## 8585892
## RPE_iCut_ABCB1_cr6_short_10h_LaminB2_R2-gatc.counts.txt.gz_Dam
## 7505476
## RPE_iCut_ADAM22_cr3_10h_LaminB2_R1-gatc.counts.txt.gz
## 2242930
## RPE_iCut_ADAM22_cr3_10h_LaminB2_R1-gatc.counts.txt.gz_Dam
## 4981281
## RPE_iCut_ADAM22_cr3_10h_LaminB2_R2-gatc.counts.txt.gz
## 9532467
## RPE_iCut_ADAM22_cr3_10h_LaminB2_R2-gatc.counts.txt.gz_Dam
## 8548987
## RPE_iCut_ADAM22_cr3_24h_LaminB2_R1-gatc.counts.txt.gz
## 1901805
## RPE_iCut_ADAM22_cr3_24h_LaminB2_R1-gatc.counts.txt.gz_Dam
## 4098346
## RPE_iCut_ADAM22_cr3_24h_LaminB2_R2-gatc.counts.txt.gz
## 8849699
## RPE_iCut_ADAM22_cr3_24h_LaminB2_R2-gatc.counts.txt.gz_Dam
## 4894243
## RPE_iCut_ANCHOR3_clone7_DMSO_LaminB2_R1-gatc.counts.txt.gz
## 4191516
## RPE_iCut_ANCHOR3_clone7_DMSO_LaminB2_R1-gatc.counts.txt.gz_Dam
## 7060830
## RPE_iCut_ANCHOR3_clone7_DMSO_LaminB2_R2-gatc.counts.txt.gz
## 5958699
## RPE_iCut_ANCHOR3_clone7_DMSO_LaminB2_R2-gatc.counts.txt.gz_Dam
## 7537442
## RPE_iCut_ANCHOR3_clone7_TXR_LaminB2_R1-gatc.counts.txt.gz
## 5246872
## RPE_iCut_ANCHOR3_clone7_TXR_LaminB2_R1-gatc.counts.txt.gz_Dam
## 7984126
## RPE_iCut_ANCHOR3_clone7_TXR_LaminB2_R2-gatc.counts.txt.gz
## 6170806
## RPE_iCut_ANCHOR3_clone7_TXR_LaminB2_R2-gatc.counts.txt.gz_Dam
## 7825084
## RPE_iCut_ANCHOR3_polyclonal_LaminB2_R1-gatc.counts.txt.gz
## 7364115
## RPE_iCut_ANCHOR3_polyclonal_LaminB2_R1-gatc.counts.txt.gz_Dam
## 5998033
## RPE_iCut_ANCHOR3_polyclonal_LaminB2_R2-gatc.counts.txt.gz
## 8677098
## RPE_iCut_ANCHOR3_polyclonal_LaminB2_R2-gatc.counts.txt.gz_Dam
## 4743951
## RPE_iCut_BIX_2_LaminB2_R1-gatc.counts.txt.gz
## 7001508
## RPE_iCut_BIX_2_LaminB2_R1-gatc.counts.txt.gz_Dam
## 8885964
## RPE_iCut_BIX_2_LaminB2_R2-gatc.counts.txt.gz
## 4889914
## RPE_iCut_BIX_2_LaminB2_R2-gatc.counts.txt.gz_Dam
## 6486412
## RPE_iCut_BIX_2_LaminB2_R3-gatc.counts.txt.gz
## 8118928
## RPE_iCut_BIX_2_LaminB2_R3-gatc.counts.txt.gz_Dam
## 8106445
## RPE_iCut_DMSO_LaminB2_R1-gatc.counts.txt.gz
## 9016169
## RPE_iCut_DMSO_LaminB2_R1-gatc.counts.txt.gz_Dam
## 6816124
## RPE_iCut_DMSO_LaminB2_R2-gatc.counts.txt.gz
## 6004829
## RPE_iCut_DMSO_LaminB2_R2-gatc.counts.txt.gz_Dam
## 6600703
## RPE_iCut_GSK126_500_LaminB2_R1-gatc.counts.txt.gz
## 6571922
## RPE_iCut_GSK126_500_LaminB2_R1-gatc.counts.txt.gz_Dam
## 4560595
## RPE_iCut_GSK126_500_LaminB2_R2-gatc.counts.txt.gz
## 15686330
## RPE_iCut_GSK126_500_LaminB2_R2-gatc.counts.txt.gz_Dam
## 6834085
## RPE_iCut_LBRKO_LaminB1_R3-gatc.counts.txt.gz
## 5837763
## RPE_iCut_LBRKO_LaminB1_R3-gatc.counts.txt.gz_Dam
## 5837763
## RPE_iCut_LBRKO_LaminB1_R4-gatc.counts.txt.gz
## 6340729
## RPE_iCut_LBRKO_LaminB1_R4-gatc.counts.txt.gz_Dam
## 6340729
## RPE_iCut_LBRKO_LaminB2_R1-gatc.counts.txt.gz
## 6228819
## RPE_iCut_LBRKO_LaminB2_R1-gatc.counts.txt.gz_Dam
## 13205669
## RPE_iCut_LBRKO_LaminB2_R2-gatc.counts.txt.gz
## 7378437
## RPE_iCut_LBRKO_LaminB2_R2-gatc.counts.txt.gz_Dam
## 8854090
## RPE_iCut_LMNAKO_LaminB2_R1-gatc.counts.txt.gz
## 10624186
## RPE_iCut_LMNAKO_LaminB2_R1-gatc.counts.txt.gz_Dam
## 5262561
## RPE_iCut_LMNAKO_LaminB2_R2-gatc.counts.txt.gz
## 5842671
## RPE_iCut_LMNAKO_LaminB2_R2-gatc.counts.txt.gz_Dam
## 5978557
## RPE_iCut_LMNB1KO_LaminB2_R1-gatc.counts.txt.gz
## 5481121
## RPE_iCut_LMNB1KO_LaminB2_R1-gatc.counts.txt.gz_Dam
## 6539260
## RPE_iCut_LMNB1KO_LaminB2_R2-gatc.counts.txt.gz
## 5141371
## RPE_iCut_LMNB1KO_LaminB2_R2-gatc.counts.txt.gz_Dam
## 5367725
## RPE_iCut_pool9_10h_ATMi_LaminB2_R1-gatc.counts.txt.gz
## 6852087
## RPE_iCut_pool9_10h_ATMi_LaminB2_R1-gatc.counts.txt.gz_Dam
## 8035263
## RPE_iCut_pool9_10h_ATMi_LaminB2_R2-gatc.counts.txt.gz
## 6365723
## RPE_iCut_pool9_10h_ATMi_LaminB2_R2-gatc.counts.txt.gz_Dam
## 6804968
## RPE_iCut_pool9_10h_DNAPKi_LaminB2_R1-gatc.counts.txt.gz
## 5011125
## RPE_iCut_pool9_10h_DNAPKi_LaminB2_R1-gatc.counts.txt.gz_Dam
## 8622881
## RPE_iCut_pool9_10h_DNAPKi_LaminB2_R2-gatc.counts.txt.gz
## 5774369
## RPE_iCut_pool9_10h_DNAPKi_LaminB2_R2-gatc.counts.txt.gz_Dam
## 6553886
## RPE_iCut_pool9_10h_LaminB2_R1-gatc.counts.txt.gz
## 4549861
## RPE_iCut_pool9_10h_LaminB2_R1-gatc.counts.txt.gz_Dam
## 5333752
## RPE_iCut_pool9_10h_LaminB2_R2-gatc.counts.txt.gz
## 6666603
## RPE_iCut_pool9_10h_LaminB2_R2-gatc.counts.txt.gz_Dam
## 6489322
## RPE_iCut_pool9_10h_LaminB2_R3-gatc.counts.txt.gz
## 5611353
## RPE_iCut_pool9_10h_LaminB2_R3-gatc.counts.txt.gz_Dam
## 5893908
## RPE_iCut_pool9_10h_LaminB2_R4-gatc.counts.txt.gz
## 6800960
## RPE_iCut_pool9_10h_LaminB2_R4-gatc.counts.txt.gz_Dam
## 8063952
## RPE_iCut_pool9_24h_LaminB2_R1-gatc.counts.txt.gz
## 5463941
## RPE_iCut_pool9_24h_LaminB2_R1-gatc.counts.txt.gz_Dam
## 7206086
## RPE_iCut_pool9_24h_LaminB2_R2-gatc.counts.txt.gz
## 5487181
## RPE_iCut_pool9_24h_LaminB2_R2-gatc.counts.txt.gz_Dam
## 7821692
## RPE_iCut_pool9tracr_10h_LaminB2_R1-gatc.counts.txt.gz
## 7021423
## RPE_iCut_pool9tracr_10h_LaminB2_R1-gatc.counts.txt.gz_Dam
## 11729929
## RPE_iCut_pool9tracr_10h_LaminB2_R2-gatc.counts.txt.gz
## 5427104
## RPE_iCut_pool9tracr_10h_LaminB2_R2-gatc.counts.txt.gz_Dam
## 11232355
## RPE_iCut_tracr_10h_LaminB2_R1-gatc.counts.txt.gz
## 2441927
## RPE_iCut_tracr_10h_LaminB2_R1-gatc.counts.txt.gz_Dam
## 5098431
## RPE_iCut_tracr_10h_LaminB2_R2-gatc.counts.txt.gz
## 10822728
## RPE_iCut_tracr_10h_LaminB2_R2-gatc.counts.txt.gz_Dam
## 9271605
## RPE_iCut_tracr_10h_LaminB2_R3-gatc.counts.txt.gz
## 8012049
## RPE_iCut_tracr_10h_LaminB2_R3-gatc.counts.txt.gz_Dam
## 5815176
## RPE_iCut_tracr_10h_LaminB2_R4-gatc.counts.txt.gz
## 8760328
## RPE_iCut_tracr_10h_LaminB2_R4-gatc.counts.txt.gz_Dam
## 6369574
## RPE_iCut_TxR_cloneF6_1_LaminB2_R1-gatc.counts.txt.gz
## 8056821
## RPE_iCut_TxR_cloneF6_1_LaminB2_R1-gatc.counts.txt.gz_Dam
## 4855944
## RPE_iCut_TxR_cloneF6_1_LaminB2_R2-gatc.counts.txt.gz
## 5153990
## RPE_iCut_TxR_cloneF6_1_LaminB2_R2-gatc.counts.txt.gz_Dam
## 7015188
## RPE_iCut_TxR_cloneG6_3_LaminB2_R1-gatc.counts.txt.gz
## 7532241
## RPE_iCut_TxR_cloneG6_3_LaminB2_R1-gatc.counts.txt.gz_Dam
## 6366250
## RPE_iCut_TxR_cloneG6_3_LaminB2_R2-gatc.counts.txt.gz
## 7788911
## RPE_iCut_TxR_cloneG6_3_LaminB2_R2-gatc.counts.txt.gz_Dam
## 6153409
## RPE_iCut_WT_LaminB2_R1-gatc.counts.txt.gz
## 6522158
## RPE_iCut_WT_LaminB2_R1-gatc.counts.txt.gz_Dam
## 8772753
## RPE_iCut_WT_LaminB2_R2-gatc.counts.txt.gz
## 3125946
## RPE_iCut_WT_LaminB2_R2-gatc.counts.txt.gz_Dam
## 6382989
## RPE_iCut_WT_LaminB2_R3-gatc.counts.txt.gz
## 4405577
## RPE_iCut_WT_LaminB2_R3-gatc.counts.txt.gz_Dam
## 6483707
## RPE_iCut_WT_LaminB2_R4-gatc.counts.txt.gz
## 7454359
## RPE_iCut_WT_LaminB2_R4-gatc.counts.txt.gz_Dam
## 8740536
# Also combined the counts (Dam + lamina) and have them separate
genes_Tom.count.combined<- genes_Tom.count.lamina <- genes_Tom.count.dam<- genes_Tom.count
mcols(genes_Tom.count.combined) <- (as(mcols(genes_Tom.count)[, seq(1, 2*n, 2)], "data.frame")+
as(mcols(genes_Tom.count)[, seq(2, 2*n, 2)], "data.frame"))
mcols(genes_Tom.count.lamina) <-mcols(genes_Tom.count)[,seq(1, 2*n,2)]
mcols(genes_Tom.count.dam) <-mcols(genes_Tom.count)[,seq(2, 2*n,2)]
#Normalize to 1 M reads
norm_factor <- library_size/ 1e6
genes_Tom.count.norm<- genes_Tom.count
mcols(genes_Tom.count.norm)<- t(t(as(mcols(genes_Tom.count), "data.frame"))/ norm_factor)
#Also create a 1M mean counts of Dam and Lamina per experiment
genes_Tom.count.norm.combined<- genes_Tom.count.norm
mcols(genes_Tom.count.norm.combined) <- (as(mcols(genes_Tom.count.norm) [,seq(1, 2*n, 2)], "data.frame")+
as(mcols(genes_Tom.count.norm) [,seq(2, 2*n, 2)], "data.frame")) /2
# Normalize Lamina over Dam function (ratio2)
NormalizeDamID <- function(df, pseudo = 1) {
# This function expects a data frame that is composed of lamina and Dam-only
# columns, in that order. It simply determines the log2 ratio of the two for
# every pair of columns. A pseudocount is added to prevent problems.
n <- ncol(df)
df.norm <- log2((df[, seq(1, n-1, 2)] + pseudo) / (df[, seq(2, n, 2)] + pseudo))
df.norm
}
#Normalize LB over Dam (ratio2)
genes_Tom.norm <- genes_Tom.count
mcols(genes_Tom.norm) <- NormalizeDamID(as(mcols(genes_Tom.count.norm), "data.frame"))
mcols(genes_Tom.norm) <- mcols(genes_Tom.norm)[, samples_Tom$sample]
# Plot the distributions for the various samples_Tom This does not work
#plot(1, 1, type = "n", xlim = c(-6, 4), ylim = c(0, 0.6),
# xlab = "Dam-ratio (log2)", ylab = "density", main = "Density Dam ratio for genes_Tom")
#for (i in 1:n) {
# lines(density(mcols(genes_Tom.norm)[, i],na.rm = TRUE,), col = samples_Tom, lwd = 2,
# lty = as.numeric(samples_Tom.df$integration[i]),)
#}
#legend("topright", legend = levels(samples_Tom),
# pch = 19, col = 1:length(levels(samples_Tom)))
#Scale the data to the same mean and stdev
genes_Tom.norm.scaled<-genes_Tom.norm
mcols(genes_Tom.norm.scaled)<- scale(as(mcols(genes_Tom.norm.scaled), "data.frame"))
# Save files as RDS
saveRDS(samples_Tom, file.path(output_dir, "samples_Tom.rds"))
saveRDS(genes_Tom.norm.scaled, file.path(output_dir, "genes_Tom_norm_filtered.rds"))
# First, change the mcols of the genes_Tom
mcols(genes_Tom) <- mcols(genes_Tom)[, c("gene_id", "gene_name", "GATC_fragments")]
saveRDS(genes_Tom, file.path(output_dir, "genes_Tom_filtered.rds"))
# samples_Tom data frame - basically metadata
padamid.samples_Tom<- readRDS(file.path(output_dir,
"samples_Tom.rds"))
# Filtered genes_Tom used in the pA-DamID analysis
padamid.genes_Tom <- readRDS(file.path(output_dir,
"genes_Tom_filtered.rds"))
# Data frame with per-gene information on differential analysis
#padamid.results <- readRDS(file.path(output_dir,
# "Differential_score.rds"))
padamid.norm <- readRDS(file.path(output_dir,
"genes_Tom_norm_filtered.rds"))
names(mcols(padamid.norm))<-padamid.samples_Tom$samples_Tom
class(padamid.norm)
## [1] "GRanges"
## attr(,"package")
## [1] "GenomicRanges"
#class(padamid.results)
class(padamid.genes_Tom)
## [1] "GRanges"
## attr(,"package")
## [1] "GenomicRanges"
class(padamid.samples_Tom)
## [1] "data.frame"
padamid_norm_df<- as(mcols(padamid.norm), "data.frame")
padamid_genes_Tom_df<- as(mcols(padamid.genes_Tom), "data.frame")
#binding gene name and danID score
damid_score_gene_Tom<-cbind(padamid_genes_Tom_df, padamid_norm_df)
Now let’s do the same for LMNB1 antibody.
#Use samples from a previous document
#samples <-list.files(path = "/DATA/usr/s.manzo/Projects/SGM20220704_pA_DamID_seq_AnnaB5_NewABCB1_clones/results/counts/bin-gatc", pattern = "Lmnb2")
samples_Tom2 <-list.files(path = "/DATA/scratch/usr/t.v.schaik/proj/sManzo_pADamID/ts210903_samples_Anna_pADamID_RPE/results/counts/bin-gatc", pattern="LaminB1")
samples_Tom2 <-as.data.frame(samples_Tom2)
#samples_Tom$sample <- gsub("-10kb", "-gatc", samples_Tom$sample)
samples_Tom2.lmnb1<- samples_Tom2$sample
n<- nrow(samples_Tom2)
#Prepare output files
df.count_Tom2.file<- file.path(output_dir, "df_count_Tom2.rds")
genes_Tom2.file<- file.path(output_dir, "genes_Tom2.rds")
gene_Tom2.counts.file<- file.path(output_dir, "genes_Tom2_counts.rds")
### 1) Count reads in genes_Tom2
# ts220203 - improve Stefano's script
# New code. Changes:
# - only read data if output .rds file does not exist
# - change to read.table to read_tsv, which should be faster
# 1) Read count data frame
library(tidyverse)
read_sample_and_dam <- function(x,
input_dir = "/DATA/scratch/usr/t.v.schaik/proj/sManzo_pADamID/ts210903_samples_Anna_pADamID_RPE/results/counts/bin-gatc") {
# read sample and dam separately
sample <- read_tsv(file.path(input_dir,
x),
show_col_types = F,
col_names = c("seqnames",
"start",
"end",
x),
col_types = c(seqnames = "-",
start = "-",
end = "-"))
dam <- read_tsv(file.path(input_dir,
gsub("LaminB1", "Dam", x)),
show_col_types = F,
col_names = c("seqnames",
"start",
"end",
paste0(x, "_Dam")),
col_types = c(seqnames = "-",
start = "-",
end = "-"))
# combine
sample <- bind_cols(sample, dam)
# and return
sample
}
if (! file.exists(df.count_Tom2.file)) {
# first, load bins
# add +1 to the start, as this was bed-based
bins <- read_tsv(file.path(input_dir,
samples_Tom2.lmnb1[1]),
col_names = c("seqnames",
"start",
"end")) %>%
dplyr::select(1:3) %>%
mutate(start = start + 1)
# load counts
tib_count <- purrr::map(samples_Tom2.lmnb1,
read_sample_and_dam) %>%
purrr::reduce(bind_cols)
# combine the two
tib_count <- bind_cols(bins, tib_count)
# convert to GRanges and set seqlevels
df.count_Tom2 <- as(tib_count, "GRanges")
# Set seqlevels
seqlevels(df.count_Tom2, pruning = "coarse") <- c(paste0("chr", 1:22), "chrX")
seqlengths(df.count_Tom2) <- chrom_sizes[seqlevels(df.count_Tom2), "length"]
saveRDS(df.count_Tom2, df.count_Tom2.file)
} else {
df.count_Tom2 <- readRDS(df.count_Tom2.file)
}
/DATA/scratch/usr/t.v.schaik/proj/sManzo_pADamID/ts210903_samples_Anna_pADamID_RPE/results/counts/bin-20kb/A549_61_LaminB2_R1-20kb.counts.txt.gz
#I need the gene locations to count the data
genes_Tom2<- import("/DATA/scratch/usr/t.v.schaik/data/gene_builds/GRCh38/gencode.v24.primary_assembly.annotation.gtf")
genes_Tom2 <- genes_Tom2[genes_Tom2$type=="gene"]
#Filter for protein coding genes_Tom2
genes_Tom2<-genes_Tom2[genes_Tom2$gene_type == "protein_coding"]
#filter for chromosomes
genes_Tom2 <- genes_Tom2[seqnames(genes_Tom2) %in% c(paste0("chr", 1:22), "chrX")]
#Set seqlevels
seqlevels(genes_Tom2, pruning = "coarse") <- c(paste0("chr", 1:22),"chrX")
seqlengths(genes_Tom2)<- chrom_sizes[seqlevels(genes_Tom2), "length"]
#First extend the genes_Tom2
genes_Tom2.query <- genes_Tom2
mcols(genes_Tom2.query)<- NULL
start(genes_Tom2.query)<- start(genes_Tom2.query) - ext
end(genes_Tom2.query)<- end(genes_Tom2.query) + ext
#Overlap genes_Tom2 with the counts
ovl<-findOverlaps(genes_Tom2.query, df.count_Tom2)
#Get the number of GATC fragment for every gene query
mcols(genes_Tom2) [,"GATC_fragments"]<- 0
t <- table(queryHits(ovl))
mcols(genes_Tom2)[as.integer(names(t)), "GATC_fragments"] <- as.numeric(t)
#Get the summed signal for every gene
gene_Tom2.counts<- genes_Tom2
mcols(gene_Tom2.counts) <-NULL
#mcols(gene.counts)[, names(mcols(df.count_Tom2))]<-NA
# Faster tibble/dtplyr implementation
tib <- lazy_dt(as.tibble(ovl)) %>% #as_tibble()
rename_all(~ c("idx_genes_Tom2", "idx_gatc")) %>%
as_tibble()
tib <- bind_cols(tib,
as.tibble(mcols(df.count_Tom2[tib$idx_gatc]))) #as_tibble()
tib_summarise <- tib %>%
gather(key, value, -contains("idx"))
tib_summarise <- lazy_dt(tib_summarise) %>%
group_by(idx_genes_Tom2, key) %>%
summarise(count = sum(value)) %>%
ungroup() %>%
mutate(key = factor(key, levels = str_replace_all(names(mcols(df.count_Tom2)), "-", "."))) %>%
as.tibble() #as_tibble()
## `summarise()` has grouped output by 'idx_genes_Tom2'. You can override using the
## `.groups` argument.
tib_genes_Tom2 <- tib_summarise %>%
spread(key, count)
mcols(gene_Tom2.counts) <- tib_genes_Tom2 %>%
dplyr::select(-idx_genes_Tom2)
names(mcols(gene_Tom2.counts)) <- names(mcols(df.count_Tom2))
saveRDS(genes_Tom2, genes_Tom2.file)
saveRDS(gene_Tom2.counts, gene_Tom2.counts.file)
df.count_Tom2.file<- file.path(output_dir, "df_count_Tom2.rds")
genes_Tom2.file<- file.path(output_dir, "genes_Tom2.rds")
gene_Tom2.counts.file<- file.path(output_dir, "genes_Tom2_counts.rds")
df.counts_Tom2<-readRDS(df.count_Tom2.file)
genes_Tom2<- readRDS(genes_Tom2.file)
genes_Tom2.count<- readRDS(gene_Tom2.counts.file)
Next, we would like to normalize the gene counts for various parameters:
Absolute read filtering
# To-do: filter genes_Tom2 for having too few reads?
# alternatively, filter for number of GATC fragments overlapping a gene?
idx<- rowSums(as(mcols(genes_Tom2.count), "data.frame") >=10) >=2
genes_Tom2.count <- genes_Tom2.count[idx]
genes_Tom2 <- genes_Tom2[idx]
The library size
library_size <- colSums(as(mcols(df.counts_Tom2), "data.frame"))
library_size
## RPE_CRISPRa_ABCB1_LaminB1_R1-gatc.counts.txt.gz
## 6648108
## RPE_CRISPRa_ABCB1_LaminB1_R1-gatc.counts.txt.gz_Dam
## 7122095
## RPE_CRISPRa_ABCB1_LaminB1_R2-gatc.counts.txt.gz
## 6901312
## RPE_CRISPRa_ABCB1_LaminB1_R2-gatc.counts.txt.gz_Dam
## 5807849
## RPE_CRISPRa_ABCB4_LaminB1_R1-gatc.counts.txt.gz
## 7213865
## RPE_CRISPRa_ABCB4_LaminB1_R1-gatc.counts.txt.gz_Dam
## 8055325
## RPE_CRISPRa_ABCB4_LaminB1_R2-gatc.counts.txt.gz
## 7230495
## RPE_CRISPRa_ABCB4_LaminB1_R2-gatc.counts.txt.gz_Dam
## 7386565
## RPE_CRISPRa_combo_LaminB1_R1-gatc.counts.txt.gz
## 7109704
## RPE_CRISPRa_combo_LaminB1_R1-gatc.counts.txt.gz_Dam
## 7596199
## RPE_CRISPRa_combo_LaminB1_R2-gatc.counts.txt.gz
## 10342949
## RPE_CRISPRa_combo_LaminB1_R2-gatc.counts.txt.gz_Dam
## 7489433
## RPE_CRISPRa_RUNCD3B_LaminB1_R1-gatc.counts.txt.gz
## 6974508
## RPE_CRISPRa_RUNCD3B_LaminB1_R1-gatc.counts.txt.gz_Dam
## 7375382
## RPE_CRISPRa_RUNCD3B_LaminB1_R2-gatc.counts.txt.gz
## 7428567
## RPE_CRISPRa_RUNCD3B_LaminB1_R2-gatc.counts.txt.gz_Dam
## 7643385
## RPE_CRISPRa_WT_LaminB1_R1-gatc.counts.txt.gz
## 5722620
## RPE_CRISPRa_WT_LaminB1_R1-gatc.counts.txt.gz_Dam
## 5329915
## RPE_CRISPRa_WT_LaminB1_R2-gatc.counts.txt.gz
## 6491458
## RPE_CRISPRa_WT_LaminB1_R2-gatc.counts.txt.gz_Dam
## 5724573
## RPE_dPC3_WT_LaminB1_R3-gatc.counts.txt.gz
## 6199078
## RPE_dPC3_WT_LaminB1_R3-gatc.counts.txt.gz_Dam
## 5864288
## RPE_dPC3_WT_LaminB1_R4-gatc.counts.txt.gz
## 6490134
## RPE_dPC3_WT_LaminB1_R4-gatc.counts.txt.gz_Dam
## 6358818
## RPE_dPC3Mi_TxR_s20_LaminB1_R1-gatc.counts.txt.gz
## 6522273
## RPE_dPC3Mi_TxR_s20_LaminB1_R1-gatc.counts.txt.gz_Dam
## 7008463
## RPE_dPC3Mi_TxR_s20_LaminB1_R2-gatc.counts.txt.gz
## 6066689
## RPE_dPC3Mi_TxR_s20_LaminB1_R2-gatc.counts.txt.gz_Dam
## 7422449
## RPE_iCut_LBRKO_LaminB1_R3-gatc.counts.txt.gz
## 5837763
## RPE_iCut_LBRKO_LaminB1_R3-gatc.counts.txt.gz_Dam
## 7289557
## RPE_iCut_LBRKO_LaminB1_R4-gatc.counts.txt.gz
## 6340729
## RPE_iCut_LBRKO_LaminB1_R4-gatc.counts.txt.gz_Dam
## 4475611
# Also combined the counts (Dam + lamina) and have them separate
genes_Tom2.count.combined<- genes_Tom2.count.lamina <- genes_Tom2.count.dam<- genes_Tom2.count
mcols(genes_Tom2.count.combined) <- (as(mcols(genes_Tom2.count)[, seq(1, 2*n, 2)], "data.frame")+
as(mcols(genes_Tom2.count)[, seq(2, 2*n, 2)], "data.frame"))
mcols(genes_Tom2.count.lamina) <-mcols(genes_Tom2.count)[,seq(1, 2*n,2)]
mcols(genes_Tom2.count.dam) <-mcols(genes_Tom2.count)[,seq(2, 2*n,2)]
#Normalize to 1 M reads
norm_factor <- library_size/ 1e6
genes_Tom2.count.norm<- genes_Tom2.count
mcols(genes_Tom2.count.norm)<- t(t(as(mcols(genes_Tom2.count), "data.frame"))/ norm_factor)
#Also create a 1M mean counts of Dam and Lamina per experiment
genes_Tom2.count.norm.combined<- genes_Tom2.count.norm
mcols(genes_Tom2.count.norm.combined) <- (as(mcols(genes_Tom2.count.norm) [,seq(1, 2*n, 2)], "data.frame")+
as(mcols(genes_Tom2.count.norm) [,seq(2, 2*n, 2)], "data.frame")) /2
# Normalize Lamina over Dam function (ratio2)
NormalizeDamID <- function(df, pseudo = 1) {
# This function expects a data frame that is composed of lamina and Dam-only
# columns, in that order. It simply determines the log2 ratio of the two for
# every pair of columns. A pseudocount is added to prevent problems.
n <- ncol(df)
df.norm <- log2((df[, seq(1, n-1, 2)] + pseudo) / (df[, seq(2, n, 2)] + pseudo))
df.norm
}
#Normalize LB over Dam (ratio2)
genes_Tom2.norm <- genes_Tom2.count
mcols(genes_Tom2.norm) <- NormalizeDamID(as(mcols(genes_Tom2.count.norm), "data.frame"))
mcols(genes_Tom2.norm) <- mcols(genes_Tom2.norm)[, samples_Tom2$sample]
# Plot the distributions for the various samples_Tom2 This does not work
#plot(1, 1, type = "n", xlim = c(-6, 4), ylim = c(0, 0.6),
# xlab = "Dam-ratio (log2)", ylab = "density", main = "Density Dam ratio for genes_Tom2")
#for (i in 1:n) {
# lines(density(mcols(genes_Tom2.norm)[, i],na.rm = TRUE,), col = samples_Tom2, lwd = 2,
# lty = as.numeric(samples_Tom2.df$integration[i]),)
#}
#legend("topright", legend = levels(samples_Tom2),
# pch = 19, col = 1:length(levels(samples_Tom2)))
#Scale the data to the same mean and stdev
genes_Tom2.norm.scaled<-genes_Tom2.norm
mcols(genes_Tom2.norm.scaled)<- scale(as(mcols(genes_Tom2.norm.scaled), "data.frame"))
# Save files as RDS
saveRDS(samples_Tom2, file.path(output_dir, "samples_Tom2.rds"))
saveRDS(genes_Tom2.norm.scaled, file.path(output_dir, "genes_Tom2_norm_filtered.rds"))
# First, change the mcols of the genes_Tom2
mcols(genes_Tom2) <- mcols(genes_Tom2)[, c("gene_id", "gene_name", "GATC_fragments")]
saveRDS(genes_Tom2, file.path(output_dir, "genes_Tom2_filtered.rds"))
# samples_Tom2 data frame - basically metadata
padamid.samples_Tom2<- readRDS(file.path(output_dir,
"samples_Tom2.rds"))
# Filtered genes_Tom2 used in the pA-DamID analysis
padamid.genes_Tom2 <- readRDS(file.path(output_dir,
"genes_Tom2_filtered.rds"))
# Data frame with per-gene information on differential analysis
#padamid.results <- readRDS(file.path(output_dir,
# "Differential_score.rds"))
padamid.norm <- readRDS(file.path(output_dir,
"genes_Tom2_norm_filtered.rds"))
names(mcols(padamid.norm))<-padamid.samples_Tom2$samples_Tom2
class(padamid.norm)
## [1] "GRanges"
## attr(,"package")
## [1] "GenomicRanges"
#class(padamid.results)
class(padamid.genes_Tom2)
## [1] "GRanges"
## attr(,"package")
## [1] "GenomicRanges"
class(padamid.samples_Tom2)
## [1] "data.frame"
padamid_norm_df<- as(mcols(padamid.norm), "data.frame")
padamid_genes_Tom2_df<- as(mcols(padamid.genes_Tom2), "data.frame")
#binding gene name and danID score
damid_score_gene_Tom2<-cbind(padamid_genes_Tom2_df, padamid_norm_df)
damid_score_gene_Tom_LB1_LB2<-inner_join(damid_score_gene_Tom,damid_score_gene_Tom2, by="gene_name")
damid_score_gene_Tom_LB1_LB2<-damid_score_gene_Tom_LB1_LB2[,-(83:84)]
Taxol_damid_score_gene_Tom<-damid_score_gene_Tom_LB1_LB2 %>%
mutate(RPE_dPC3_TXR3_LB2=(`RPE_dPC3_TXR3_LaminB2_R1-gatc.counts.txt.gz`+ `RPE_dPC3_TXR3_LaminB2_R2-gatc.counts.txt.gz`)/2)%>%
mutate(RPE_dPC3_TXR4_LB2=(`RPE_dPC3_TXR4_LaminB2_R1-gatc.counts.txt.gz`+ `RPE_dPC3_TXR4_LaminB2_R2-gatc.counts.txt.gz`)/2)%>%
mutate(RPE_dPC3_WT_LB2=(`RPE_dPC3_WT_LaminB2_R1-gatc.counts.txt.gz`+ `RPE_dPC3_WT_LaminB2_R2-gatc.counts.txt.gz`)/2)%>%
mutate(RPE_dPC3_WT_LB1=(`RPE_dPC3_WT_LaminB1_R3-gatc.counts.txt.gz`+ `RPE_dPC3_WT_LaminB1_R4-gatc.counts.txt.gz`)/2)%>%
mutate(RPE_dPC3Mi_TXR_LB1=(`RPE_dPC3Mi_TxR_s20_LaminB1_R1-gatc.counts.txt.gz`+ `RPE_dPC3Mi_TxR_s20_LaminB1_R2-gatc.counts.txt.gz`)/2)%>%
mutate(RPE_iCut_5AZA_62_5_LB2=(`RPE_iCut_5AZA_62_5_LaminB2_R1-gatc.counts.txt.gz`+ `RPE_iCut_5AZA_62_5_LaminB2_R2-gatc.counts.txt.gz`)/2)%>%
mutate(RPE_iCut_ANCHOR3_clone7_DMSO_LB2=(`RPE_iCut_ANCHOR3_clone7_DMSO_LaminB2_R1-gatc.counts.txt.gz`+ `RPE_iCut_ANCHOR3_clone7_DMSO_LaminB2_R2-gatc.counts.txt.gz`)/2)%>%
mutate(RPE_iCut_ANCHOR3_clone7_TXR_LB2=(`RPE_iCut_ANCHOR3_clone7_TXR_LaminB2_R1-gatc.counts.txt.gz`+ `RPE_iCut_ANCHOR3_clone7_TXR_LaminB2_R2-gatc.counts.txt.gz`)/2)%>%
mutate(RPE_iCut_ANCHOR3_polyclonal_LB2=(`RPE_iCut_ANCHOR3_polyclonal_LaminB2_R1-gatc.counts.txt.gz`+ `RPE_iCut_ANCHOR3_polyclonal_LaminB2_R2-gatc.counts.txt.gz`)/2)%>%
mutate(RPE_iCut_BIX_2_LB2=(`RPE_iCut_BIX_2_LaminB2_R1-gatc.counts.txt.gz`+
`RPE_iCut_BIX_2_LaminB2_R3-gatc.counts.txt.gz`)/2)%>%
mutate(RPE_iCut_DMSO_LB2=(`RPE_iCut_DMSO_LaminB2_R1-gatc.counts.txt.gz`+
`RPE_iCut_DMSO_LaminB2_R2-gatc.counts.txt.gz`)/2)%>%
mutate(RPE_iCut_GSK126_500_LB2=(`RPE_iCut_GSK126_500_LaminB2_R1-gatc.counts.txt.gz`+ `RPE_iCut_GSK126_500_LaminB2_R2-gatc.counts.txt.gz`)/2)%>%
mutate(RPE_iCut_LBRKO_LB2=(`RPE_iCut_LBRKO_LaminB2_R1-gatc.counts.txt.gz`+ `RPE_iCut_LBRKO_LaminB2_R2-gatc.counts.txt.gz`)/2)%>%
mutate(RPE_iCut_LMNAKO_LB2=(`RPE_iCut_LMNAKO_LaminB2_R1-gatc.counts.txt.gz`+ `RPE_iCut_LMNAKO_LaminB2_R2-gatc.counts.txt.gz`)/2)%>%
mutate(RPE_iCut_LMNB1KO_LB2=(`RPE_iCut_LMNB1KO_LaminB2_R1-gatc.counts.txt.gz`+ `RPE_iCut_LMNB1KO_LaminB2_R2-gatc.counts.txt.gz`)/2)%>%
mutate(RPE_iCut_WT_LB2=(`RPE_iCut_WT_LaminB2_R3-gatc.counts.txt.gz`+ `RPE_iCut_WT_LaminB2_R4-gatc.counts.txt.gz`)/2)
ABCB1<-Taxol_damid_score_gene_Tom%>%filter(gene_name=="ABCB1")
#pdf("/DATA/usr/s.manzo/Projects/SGM20220704_pA_DamID_seq_AnnaB5_NewABCB1_clones/pdf/Scatter_Plot_BIX_treatment.pdf")
Taxol_damid_score_gene_Tom%>% ggplot(., aes(x= RPE_iCut_DMSO_LB2, y= RPE_iCut_BIX_2_LB2))+ rasterize(geom_point(alpha=0.1, size=0.3), dpi=300) + stat_cor(method = "pearson", label.x = -2, label.y= 2.2) +ggtitle("Bix treatment,ABCB1, 20 kb_bins")+ theme_bw()+ geom_abline()+ geom_point(data=ABCB1, aes(x= RPE_iCut_DMSO_LB2, y= RPE_iCut_BIX_2_LB2), color="red", size=2)+geom_text(data=ABCB1,aes(label=gene_name ),color="black", hjust=0, vjust=1.5)
#dev.off()
#pdf("/DATA/usr/s.manzo/Projects/SGM20220704_pA_DamID_seq_AnnaB5_NewABCB1_clones/pdf/Scatter_Plot_GSK_treatment.pdf")
Taxol_damid_score_gene_Tom %>% ggplot(., aes(x= RPE_iCut_DMSO_LB2, y= RPE_iCut_GSK126_500_LB2))+ rasterize(geom_point(alpha=0.1, size=0.3), dpi=300) + stat_cor(method = "pearson", label.x = -2, label.y= 2.2) +ggtitle("GSK treatment,ABCB1, 20 kb_bins")+ theme_bw()+ geom_abline()+ geom_point(data=ABCB1, aes(x= RPE_iCut_DMSO_LB2, y= RPE_iCut_GSK126_500_LB2), color="red", size=2)+geom_text(data=ABCB1,aes(label=gene_name ),color="black", hjust=0, vjust=1.5)
#dev.off()
#pdf("/DATA/usr/s.manzo/Projects/SGM20220704_pA_DamID_seq_AnnaB5_NewABCB1_clones/pdf/Scatter_Plot_AZA_treatment.pdf")
Taxol_damid_score_gene_Tom %>% ggplot(., aes(x= RPE_iCut_DMSO_LB2, y= RPE_iCut_5AZA_62_5_LB2))+ rasterize(geom_point(alpha=0.1, size=0.3), dpi=300) + stat_cor(method = "pearson", label.x = -2, label.y= 2.2) +ggtitle("AZA treatment,ABCB1, 20 kb_bins")+ theme_bw()+ geom_abline()+ geom_point(data=ABCB1, aes(x= RPE_iCut_DMSO_LB2, y= RPE_iCut_5AZA_62_5_LB2), color="red", size=2)+geom_text(data=ABCB1,aes(label=gene_name ),color="black", hjust=0, vjust=1.5)
#dev.off()
#pdf("/DATA/usr/s.manzo/Projects/SGM20220704_pA_DamID_seq_AnnaB5_NewABCB1_clones/pdf/Scatter_Plot_LBRKO.pdf")
Taxol_damid_score_gene_Tom %>% ggplot(., aes(x= RPE_iCut_WT_LB2, y= RPE_iCut_LBRKO_LB2))+ rasterize(geom_point(alpha=0.1, size=0.3), dpi=300) + stat_cor(method = "pearson", label.x = -2, label.y= 2.2) +ggtitle("LBRKO,ABCB1, 20 kb_bins")+ theme_bw()+ geom_abline()+ geom_point(data=ABCB1, aes(x= RPE_iCut_WT_LB2, y= RPE_iCut_LBRKO_LB2), color="red", size=2)+geom_text(data=ABCB1,aes(label=gene_name ),color="black", hjust=0, vjust=1.5)
#dev.off()
#pdf("/DATA/usr/s.manzo/Projects/SGM20220704_pA_DamID_seq_AnnaB5_NewABCB1_clones/pdf/Scatter_Plot_LMNAKO.pdf")
Taxol_damid_score_gene_Tom %>% ggplot(., aes(x= RPE_iCut_WT_LB2, y= RPE_iCut_LMNAKO_LB2))+ rasterize(geom_point(alpha=0.1, size=0.3), dpi=300) + stat_cor(method = "pearson", label.x = -2, label.y= 2.2) +ggtitle("LMNAKO,ABCB1, 20 kb_bins")+ theme_bw()+ geom_abline()+ geom_point(data=ABCB1, aes(x= RPE_iCut_WT_LB2, y= RPE_iCut_LMNAKO_LB2), color="red", size=2)+geom_text(data=ABCB1,aes(label=gene_name ),color="black", hjust=0, vjust=1.5)
#dev.off()
#pdf("/DATA/usr/s.manzo/Projects/SGM20220704_pA_DamID_seq_AnnaB5_NewABCB1_clones/pdf/Scatter_Plot_LMNB1KO.pdf")
Taxol_damid_score_gene_Tom %>% ggplot(., aes(x= RPE_iCut_WT_LB2, y= RPE_iCut_LMNB1KO_LB2))+ rasterize(geom_point(alpha=0.1, size=0.3), dpi=300) + stat_cor(method = "pearson", label.x = -2, label.y= 2.2) +ggtitle("LMNB1KO,ABCB1, 20 kb_bins")+ theme_bw()+ geom_abline()+ geom_point(data=ABCB1, aes(x= RPE_iCut_WT_LB2, y= RPE_iCut_LMNB1KO_LB2), color="red", size=2)+geom_text(data=ABCB1,aes(label=gene_name ),color="black", hjust=0, vjust=1.5)
#dev.off()
Let’s bind old and new data
Taxol_damid_score_gene_merge<-inner_join(Taxol_damid_score_gene_Tom, Taxol_damid_score_gene, by="gene_name")
Let’s focus on RPE20 the first TXR selected clone First scatter lot according to the original control
ABCB1<-Taxol_damid_score_gene_merge%>%filter(gene_name=="ABCB1")
#pdf("/DATA/usr/s.manzo/Projects/SGM20220704_pA_DamID_seq_AnnaB5_NewABCB1_clones/pdf/Scatter_Plot_dPC3WTvsTXR_s20.pdf")
Taxol_damid_score_gene_merge %>% ggplot(., aes(x= RPE_dPC3_WT_LB1, y= RPE_dPC3Mi_TXR_LB1))+ rasterize(geom_point(alpha=0.1, size=0.3), dpi=300) + stat_cor(method = "pearson", label.x = -2, label.y= 2.2) +ggtitle("TXR_s20 vs dPC3,ABCB1, 20 kb_bins")+ theme_bw()+ geom_abline()+ geom_point(data=ABCB1, aes(x= RPE_dPC3_WT_LB1, y= RPE_dPC3Mi_TXR_LB1), color="red", size=2)+geom_text(data=ABCB1,aes(label=gene_name ),color="black", hjust=0, vjust=1.5)
#dev.off()
Now according to Anna the original parental control for TXR_s20 (or RPOE20) is RPE0. This dataset has LB2 data and not LB1, but let see how it looks.
#pdf("/DATA/usr/s.manzo/Projects/SGM20220704_pA_DamID_seq_AnnaB5_NewABCB1_clones/pdf/Scatter_Plot_RPE0vsTXR_s20.pdf")
Taxol_damid_score_gene_merge %>% ggplot(., aes(x= RPE0, y= RPE_dPC3Mi_TXR_LB1))+ rasterize(geom_point(alpha=0.1, size=0.3), dpi=300) + stat_cor(method = "pearson", label.x = -2, label.y= 2.2) +ggtitle("TXR_s20 vs RPE0,ABCB1, 20 kb_bins")+ theme_bw()+ geom_abline()+ geom_point(data=ABCB1, aes( x= RPE0, y= RPE_dPC3Mi_TXR_LB1), color="red", size=2)+geom_text(data=ABCB1,aes(label=gene_name ),color="black", hjust=0, vjust=1.5)
#dev.off()
Now let’s calculate the delta for all TXR vs relative control
Taxol_damid_score_gene_merge <-Taxol_damid_score_gene_merge%>%
dplyr::mutate(Delta_TXRs20=RPE_dPC3Mi_TXR_LB1-RPE_dPC3_WT_LB1)%>%
dplyr::mutate(Delta_TXR3=TXR3-DPURO_3)%>%
dplyr::mutate(Delta_TXR4=TXR4-DPURO_3)%>%
dplyr::mutate(Delta_TXR5=TXR5-RPE0)%>%
dplyr::mutate(Delta_TXR6=TXR6-RPE0)%>%
dplyr::mutate(Delta_ANCHOR_P_TXR=ANCHOR_P_TXR-ANCHOR_P)%>%
dplyr::mutate(Delta_ANCHOR_c7_TXR=ANCHOR_c7_TXR_r1-ANCHOR_c7_r1)
ABCB1_delta<-Taxol_damid_score_gene_merge%>%filter(gene_name=="ABCB1")%>%select(gene_name,Delta_TXRs20,Delta_TXR3,Delta_TXR4, Delta_TXR5, Delta_TXR6, Delta_ANCHOR_P_TXR, Delta_ANCHOR_c7_TXR)
ABCB1_delta_gathered<-ABCB1_delta%>%gather(Delta_TXRs20,Delta_TXR3,Delta_TXR4, Delta_TXR5, Delta_TXR6, Delta_ANCHOR_P_TXR, Delta_ANCHOR_c7_TXR,key ="Clone", value = "Delta")
t.test(ABCB1_delta_gathered$Delta, alternative = "less")
##
## One Sample t-test
##
## data: ABCB1_delta_gathered$Delta
## t = -3.1993, df = 6, p-value = 0.009309
## alternative hypothesis: true mean is less than 0
## 95 percent confidence interval:
## -Inf -0.3986421
## sample estimates:
## mean of x
## -1.015347
p_val <- 0.009309
#pdf("/DATA/usr/s.manzo/Projects/SGM20220704_pA_DamID_seq_AnnaB5_NewABCB1_clones/pdf/Detachment_score_allclones.pdf")
ABCB1_delta%>%gather(Delta_TXRs20,Delta_TXR3,Delta_TXR4, Delta_TXR5, Delta_TXR6, Delta_ANCHOR_P_TXR, Delta_ANCHOR_c7_TXR,key ="Clone", value = "Delta")%>%ggplot(., aes(x=gene_name, y=Delta, color=Clone))+geom_beeswarm()+geom_hline(yintercept = 0, linetype=2)+ylab("Detachment score")+theme_bw()+ ylim(-2.5,0.5)+ annotate("text",x=1,y=0.3,label=paste0('atop(bold("p = ',p_val,'"))'),cex=4,parse=TRUE)
#dev.off()
All_corr<-Taxol_damid_score_gene_merge%>%
dplyr::select(`RPE_dPC3_TXR3_LaminB2_R1-gatc.counts.txt.gz`,`RPE_dPC3_TXR3_LaminB2_R2-gatc.counts.txt.gz`,
`RPE_dPC3_TXR4_LaminB2_R1-gatc.counts.txt.gz`,`RPE_dPC3_TXR4_LaminB2_R2-gatc.counts.txt.gz`,
`RPE_dPC3_TXR4_LaminB2_R2-gatc.counts.txt.gz`,`RPE_dPC3_WT_LaminB2_R1-gatc.counts.txt.gz`,
`RPE_iCut_5AZA_62_5_LaminB2_R1-gatc.counts.txt.gz`,`RPE_iCut_5AZA_62_5_LaminB2_R2-gatc.counts.txt.gz`,
`RPE_iCut_BIX_2_LaminB2_R1-gatc.counts.txt.gz`,`RPE_iCut_BIX_2_LaminB2_R2-gatc.counts.txt.gz`,
`RPE_iCut_DMSO_LaminB2_R1-gatc.counts.txt.gz`,`RPE_iCut_DMSO_LaminB2_R2-gatc.counts.txt.gz`,
`RPE_iCut_GSK126_500_LaminB2_R1-gatc.counts.txt.gz`,`RPE_iCut_GSK126_500_LaminB2_R2-gatc.counts.txt.gz`,
`RPE_iCut_LMNAKO_LaminB2_R1-gatc.counts.txt.gz`,`RPE_iCut_LMNAKO_LaminB2_R2-gatc.counts.txt.gz`,
`RPE_iCut_LMNB1KO_LaminB2_R1-gatc.counts.txt.gz`,`RPE_iCut_LMNB1KO_LaminB2_R2-gatc.counts.txt.gz`,
`RPE_dPC3Mi_TxR_s20_LaminB1_R1-gatc.counts.txt.gz`,`RPE_dPC3Mi_TxR_s20_LaminB1_R2-gatc.counts.txt.gz`,
`pADamID-DPURO3_r1_Lmnb2-gatc.counts.txt.gz`,`pADamID-DPURO3_r2_Lmnb2-gatc.counts.txt.gz`,
`ANCHOR_c7_r1`,`ANCHOR_c7_TXR_r1`,
`pADamID-iCA3P_r1_Lmnb2-gatc.counts.txt.gz`,`pADamID-iCA3P_r2_Lmnb2-gatc.counts.txt.gz`,
`pADamID-iCA3P_TXR_r1_Lmnb2-gatc.counts.txt.gz`, `pADamID-iCA3P_TXR_r2_Lmnb2-gatc.counts.txt.gz`, `pADamID-RPE0_r1_Lmnb2-gatc.counts.txt.gz`, `pADamID-RPE0_r2_Lmnb2-gatc.counts.txt.gz`,
`pADamID-TXR3_r1_Lmnb2-gatc.counts.txt.gz`, `pADamID-TXR3_r2_Lmnb2-gatc.counts.txt.gz`,
`pADamID-TXR4_r1_Lmnb2-gatc.counts.txt.gz`, `pADamID-TXR4_r2_Lmnb2-gatc.counts.txt.gz`, `pADamID-TXR5_r1_Lmnb2-gatc.counts.txt.gz`,`pADamID-TXR5_r2_Lmnb2-gatc.counts.txt.gz`, `pADamID-TXR6_r1_Lmnb2-gatc.counts.txt.gz`,`pADamID-TXR6_r2_Lmnb2-gatc.counts.txt.gz`,`RPE_CRISPRa_ABCB1_LaminB1_R1-gatc.counts.txt.gz`,`RPE_CRISPRa_ABCB1_LaminB1_R2-gatc.counts.txt.gz`, `RPE_CRISPRa_ABCB4_LaminB1_R1-gatc.counts.txt.gz`, `RPE_CRISPRa_ABCB4_LaminB1_R2-gatc.counts.txt.gz`,`RPE_CRISPRa_combo_LaminB1_R1-gatc.counts.txt.gz`, `RPE_CRISPRa_combo_LaminB1_R2-gatc.counts.txt.gz`,`RPE_CRISPRa_RUNCD3B_LaminB1_R1-gatc.counts.txt.gz`,`RPE_CRISPRa_RUNCD3B_LaminB1_R2-gatc.counts.txt.gz`,`RPE_CRISPRa_WT_LaminB1_R1-gatc.counts.txt.gz`, `RPE_CRISPRa_WT_LaminB1_R2-gatc.counts.txt.gz`)%>%
cor()
All_corr_df<-as.data.frame(All_corr)
write.csv2(All_corr_df,"/DATA/usr/s.manzo/Projects/SGM20220704_pA_DamID_seq_AnnaB5_NewABCB1_clones/pdf/Allcorrelation.csv", row.names = TRUE)
require(pheatmap)
## Loading required package: pheatmap
require(RColorBrewer)
#pdf("/DATA/usr/s.manzo/Projects/SGM20220704_pA_DamID_seq_AnnaB5_NewABCB1_clones/pdf/Allcorrelation.pdf")
pheatmap(All_corr)
#dev.off()
#A <- matrix(rnorm(25, 0, 5), nrow = 5, ncol = 5)
#print(A)
# Plot a heatmap
#heatmap(A,Rowv=NA,Colv=NA,col=heat.colors(3))
# Plot a corresponding legend
#legend(x="right", legend=c("min", "med", "max"),fill=heat.colors(3))